library(tidyverse) # ecosystem of data science packages
library(rvest) # for web scraping
American Women Marathoners
1 Description
The data visualization of this module is based on the scatter plot that appears in the following New York Times article by Talya Minsberg and Kevin Quealy
Why Are American Women Running Faster Than Ever? We Asked Them—Hundreds of Them
https://www.nytimes.com/interactive/2020/02/28/sports/womens-olympic-marathon-trials.html
1.1 Details (datasheet)
- Topic(s):
- Sports
- Athletics
- Women
- Marathon
- U.S.
- Data:
- Size: medium data
- Format: data in HTML tables
- Requires web scraping
- Requires some cleansing (e.g. regex)
- Requires merging data
- Graphic:
- Type: scatter plot
- Styles: ggplot, interactive ggiraph, interactive leaflet
- Interactive: no
1.2 R Packages
2 Data
The source of the data is the website World Athletics. The data collection involves web crawling and scraping several pages from this website. All the R code used for this task is in the script web-scraping-world-athletrics.R, located in the folder data/ of this repository.
In this data/ folder, you’ll find various HTML files of the form marathon-women-yyyy.html where yyyy indicates a given year from 2001 to 2024.
2.1 Data Preparation
The commands below import the HTML files in R, scrape the HTML tables into data frames, and extract the Mark and Competitor columns. The main output is a tidy table with columns:
- Year: year
- Mark: Time to complete marathon (string)
- Competitor: Name of runner
- hours: number of hours (from Mark), expressed in minutes
- mins: number of minutes (from Mark)
- secs: number of seconds (from Mark), expressed in minutes
- time: total time in minutes to complete marathon
- pace: time (minutes) to run a mile
# Import the yearly HTML files and collect the Mark and Competitor columns.
# Each file data/marathon-women-<year>.html is parsed once, and only the
# first HTML table found on the page is used.
years <- 2001:2024
yearly_tables <- lapply(years, function(y) {
  path <- paste0("data/marathon-women-", y, ".html")
  tbl_raw <- html_table(read_html(path))[[1]]
  # fail early, naming the file, when an expected column is missing
  missing_cols <- setdiff(c("Mark", "Competitor"), names(tbl_raw))
  if (length(missing_cols) > 0) {
    stop(
      "Missing column(s) ", paste(missing_cols, collapse = ", "),
      " in ", path,
      call. = FALSE
    )
  }
  tbl_raw
})

# output vectors, built in one step instead of being grown inside a loop
# Year repeats each year once per row of that year's table
Year <- rep(years, times = vapply(yearly_tables, nrow, integer(1)))
Mark <- unlist(lapply(yearly_tables, function(tbl) tbl$Mark))
Competitor <- unlist(lapply(yearly_tables, function(tbl) tbl$Competitor))
# Combine the three parallel vectors into one tibble; the column names are
# taken from the variable names (Year, Mark, Competitor).
dat <- tibble(Year, Mark, Competitor)
# a bit of cleaning: split Mark ("h:mm:ss") into its three components.
# A single anchored pattern is used so that each piece is taken from its own
# position in the string; marks that do not match yield NA in every derived
# column.
mark_parts <- str_match(dat$Mark, "^\\s*(\\d+):(\\d{2}):(\\d{2})")

dat <- dat |>
  mutate(
    hours = as.numeric(mark_parts[, 2]) * 60, # hours part, stored in minutes
    mins = as.numeric(mark_parts[, 3]), # minutes part
    secs = as.numeric(mark_parts[, 4]) / 60, # seconds part, stored in minutes
    time = hours + mins + secs, # total time in minutes
    pace = time / 26.2188 # marathon in miles
  )
head(dat)
# A tibble: 6 × 8
Year Mark Competitor hours mins secs time pace
<int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2001 2:37:57 Magdalena LEWY-BOULET 120 37 0.95 158. 6.02
2 2001 2:39:20 Jill GAITENBY 120 39 0.333 159. 6.08
3 2001 2:40:33 Mary KNISELY 120 40 0.55 161. 6.12
4 2001 2:40:34 Michelle SIMONAITIS 120 40 0.567 161. 6.12
5 2001 2:40:46 Linda SOMERS-SMITH 120 40 0.767 161. 6.13
6 2001 2:41:41 Mary ELLIS 120 41 0.683 162. 6.17
3 Graphics
We go over a series of plots to work out the plotting details.
3.1 Graphic 1
First we do a sanity check:
# Sanity check: total time (minutes) against year, default point style
dat |>
  ggplot(mapping = aes(x = Year, y = time)) +
  geom_point() +
  labs(
    x = "",
    title = "50 fastest U.S. women’s marathons per year"
  )
3.2 Graphic 2
We set the point shape to 21 so that we can use separate colors for the fill and the border (color)
# Same scatter plot, now with filled points: shape 21 takes both a fill
# color and a border color.
dat |>
  ggplot(mapping = aes(x = Year, y = time)) +
  geom_point(shape = 21, color = "white", fill = "tomato") +
  labs(
    x = "",
    title = "50 fastest U.S. women’s marathons per year"
  )
3.3 Graphic 3
We need to reverse the y-axis; this can be done with scale_y_reverse(). We also use geom_jitter() to scatter the points along the x-axis
# Pace (minutes per mile) against year, with the y-axis reversed.
# Points are jittered horizontally only: height = 0 keeps every point at its
# actual pace value (by default geom_jitter() also perturbs y).
ggplot(data = dat, aes(x = Year, y = pace)) +
  geom_jitter(
    fill = "#eb2f72", shape = 21, color = "white",
    width = 0.1, height = 0
  ) +
  scale_y_reverse() +
  scale_x_continuous(breaks = seq(2001, 2024, by = 2)) +
  labs(
    title = "50 fastest U.S. women’s marathons per year",
    x = ""
  )
3.4 Graphic 4
Finally, we take care of the remaining details: e.g. grid lines, background, scales, etc.
# Final version: pace (minutes per mile) against year on a reversed y-axis,
# with a light background and horizontal grid lines only.
# Points are jittered horizontally only: height = 0 keeps every point at its
# actual pace value (by default geom_jitter() also perturbs y).
ggplot(data = dat, aes(x = Year, y = pace)) +
  geom_jitter(
    fill = "#eb2f72", shape = 21, color = "white",
    width = 0.1, height = 0
  ) +
  scale_y_reverse() +
  scale_x_continuous(breaks = seq(2001, 2024, by = 2)) +
  labs(
    title = "50 fastest U.S. women’s marathons per year",
    x = ""
  ) +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color = "gray92"),
    panel.background = element_rect(fill = "gray97")
  )